#!/bin/bash
#BSUB -P CSC249ADCD03
#BSUB -J AMREX_GPU
#BSUB -o AMREX_GPU.o%J
#BSUB -e AMREX_GPU.e%J
#BSUB -W 10 
#BSUB -nnodes 1
#BSUB -alloc_flags "smt1" 

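## BSUB -alloc_flags gpumps
# The directive above is disabled. Enabling it turns on MPS, which is needed
# when running multiple MPI ranks per GPU. For standard runs, it is not needed.
# If you enable it, combine it with the existing flag in a single directive,
# e.g.  -alloc_flags "smt1 gpumps"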

# Record the loaded modules in the job output so the run can be reproduced.
module list

# From here on, echo every command before it runs (same as `set -x`).
set -o xtrace

# =====================
# Find the flags you need using jsrun visualizer:
# https://jsrunvisualizer.olcf.ornl.gov
#
# BSUB parameters
# -nnodes = number of nodes

# JSRUN parameters
# -n: number of resource sets
#   (alternatively,
# -r: number of resource sets per node)
# -a: number of MPI tasks/ranks per resource set
# -c: number of CPU cores per resource set
# -g: number of GPUs per resource set

# Summit: Each node has:
#         2  Sockets
#         3  GPUs per socket (6 total)
#         21 CPUs per socket (42 total)
#         4  Hardware Threads per CPU (168 total)
#
# On Summit: Recommended initial configurations:
#             one GPU per MPI rank,
#             one resource set per socket
#             (-r2 -c21 -g3 -a3 --bind=packed:7)
#             or one resource set per GPU
#             (-r6  -c7 -g1 -a1 --bind=packed:7)
# =====================
 
# Number of OpenMP threads per MPI rank; exported so that the processes
# started by jsrun inherit it.
omp=1
export OMP_NUM_THREADS="${omp}"

# Executable, launcher command line and inputs file.
# JSRUN and SMPIARGS are bash arrays (expanded as "${NAME[@]}") so that an
# argument containing spaces, such as the --smpiargs value below, reaches
# jsrun as ONE word instead of being split apart.
EXE="./main3d.pgi.MPI.CUDA.ex"
JSRUN=(jsrun -r 6 -a 1 -g 1 -c 7 --bind=packed:7)
INPUTS=inputs_3d
SMPIARGS=()

# Runtime flags
# =====================
# Flag to disable CUDA-aware MPI functionality.
# Unneeded normally.
# However, if not using CUDA-aware MPI (off by default), it can be useful to
# set it.
# NOTE(review): the original explanation was cut off mid-sentence; confirm the
# intended rationale.
# It must be exported: a plain assignment stays local to this shell and is
# never seen by the launched ranks.
export PAMI_DISABLE_IPC=1

# SMPIARGS+=(--smpiargs="-gpu")
# Allows CUDA-aware MPI functionality.
# Only needed if amrex.use_gpu_aware_mpi=1.
# Requires using one resource set per node.
# RECOMMENDED OFF. ONLY ADD IF YOU REQUIRE IT.

# The whole --smpiargs value is a single array element. The previous string
# form with nested double quotes was parsed by bash as a command named "-x",
# which failed and left SMPIARGS empty.
SMPIARGS+=(--smpiargs="-disable_gpu_hooks -x PAMI_DISABLE_CUDA_HOOK=1")
# Turns off CUDA hooks required for CUDA-aware MPI.
# RECOMMENDED ON.
# May eliminate intermittent finalize bug that prevents obtaining profiling data.

# Profiler flags
# =====================
# --openmp-profiling off
# Needed when using nvprof with Cuda >= 10 on summit, see Resolved Issues:
# https://docs.olcf.ornl.gov/systems/summit_user_guide.html

# --device-buffer-size <size in MBs> (recommended initial value: 64)
# Reserves memory for storing profiling data for non-CDP operations for each buffer on a
#       context. The default value is 8MB.
# Ensures sufficient memory so that profiling information written to stdout is reported
#       in cases where HBM is over-subscribed (large memory problems).

# Example run lines
# =====================
# Enable exactly one of the run lines below. stdout and stderr of the run are
# both written to out.<LSF job id>.

# 1. Run normally
"${JSRUN[@]}" "${SMPIARGS[@]}" "${EXE}" "${INPUTS}" > "out.${LSB_JOBID}" 2>&1

# 2. Run under nsys profile and store performance data in a qdrep file
#"${JSRUN[@]}" "${SMPIARGS[@]}" nsys profile -o "nsys_out.%q{PMIX_RANK}.%q{LSB_JOBID}" "${EXE}" "${INPUTS}" > "out.${LSB_JOBID}" 2>&1

# 3. Run under nsys profile and store performance data in a qdrep file and collect summary stats in sqlite file
#    Warning: summary statistics will take A LOT of extra time at the end of the run
#"${JSRUN[@]}" "${SMPIARGS[@]}" nsys profile --stats=true -o "nsys_out.%q{PMIX_RANK}.%q{LSB_JOBID}" "${EXE}" "${INPUTS}" > "out.${LSB_JOBID}" 2>&1

# 4. Run under nv-nsight-cu-cli for Nsight Compute command line interface
#    Warning: This will do A LOT of analysis. Limit the kernels profiled with additional flags.
#    For filtering examples, see:
#    https://docs.nvidia.com/nsight-compute/NsightComputeCli/index.html#nvtx-filtering
#    The output name uses the same PMIX_RANK / LSB_JOBID variables as the nsys
#    examples; the SLURM_* variables are not set under LSF/jsrun.
#"${JSRUN[@]}" "${SMPIARGS[@]}" nv-nsight-cu-cli -o "cucli_out.%q{PMIX_RANK}.%q{LSB_JOBID}" "${EXE}" "${INPUTS}" > "out.${LSB_JOBID}" 2>&1

